"""
用爬虫技术抓取测试文章
url：http://42.192.62.88:30166/
"""

import requests
from lxml import etree

# Scrape the text of rows 2..8 from the test article page and save it to
# data.txt, once per "page" (two pages are written).

# Address of the page to scrape.
url = 'http://42.192.62.88:30166/'
# Seconds to wait for the server; without a timeout requests.get can block
# indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10

# Output lines are collected first and written in one go at the end, so a
# failed request or parse never leaves a half-written data.txt behind.
lines = []
for j in range(2, 4):
    # NOTE(review): every iteration requests the same URL, so both "pages"
    # contain the same content. If the site has real pagination, build the
    # page-specific URL from j here - the paging scheme is not known.
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    # Parse the HTML text into an element tree that can be queried by XPath.
    doc = etree.HTML(response.text)

    lines.append(f"第{j-1}页\n")
    for i in range(2, 9):
        matches = doc.xpath(f'/html/body/div[6]/div[3]/div/div[{i}]/p/text()')
        if not matches:
            # The expected node is absent; skip this row rather than abort
            # the whole run with an IndexError.
            print(i, 'no text node found, skipped')
            continue
        ele = matches[0]
        # Echo the raw text for progress/debugging.
        print(i, ele)
        # Remove every whitespace character (spaces, newlines, NBSP, ...).
        element = ''.join(ele.split())
        lines.append(f"第{i}行内容:{element}\n")

# An explicit UTF-8 encoding keeps the Chinese text writable regardless of the
# platform's default locale; the with-block guarantees the file is closed.
with open('data.txt', 'w', encoding='utf-8') as file:
    file.writelines(lines)
